Input

Get list of articles and of MeSH terms

# Read json with MeSH terms by paper
mesh_terms_by_article_list = read_json('./../Data/SFARI/MeSH_terms_by_paper.json', simplifyVector = TRUE)

Preprocess MeSH terms

  • Remove subheadings/qualifiers

  • Separate MeSH terms with commas into two different terms

qualifiers = c('abnormalities', 'administration & dosage', 'adverse effects', 'analogs & derivatives', 'analysis', 
               'anatomy & histology', 'antagonists & inhibitors', 'biosynthesis', 'blood supply', 'cerebrospinal fluid',
               'chemical synthesis', 'chemically induced', 'classification', 'complications', 'congenital',
               'cytology', 'deficiency', 'diagnosis', 'diagnostic imaging', 'diet therapy', 'drug effects', 'drug therapy',
               'economics', 'education', 'embryology', 'enzymology', 'epidemiology', 'ethics', 'ethnology', 'etiology',
               'growth & development', 'history', 'immunology', 'injuries', 'innervation', 'instrumentation',
               'isolation & purification', 'legislation & jurisprudence', 'manpower', 'metabolism', 'methods', 'microbiology',
               'mortality', 'nursing', 'organization & administration', 'parasitology', 'pathology', 'pharmacokinetics',
               'pharmacology', 'physiology', 'physiopathology', 'poisoning', 'prevention & control', 'psychology',
               'radiation effects', 'radiotherapy', 'rehabilitation', 'secondary', 'secretion', 'standards',
               'statistics & numerical data', 'supply & distribution', 'surgery', 'therapeutic use', 'therapy', 'toxicity',
               'transmission', 'trends', 'ultrastructure', 'urine', 'utilization', 'veterinary', 'virology')

qualifiers_all = c(qualifiers, 'agonists', 'blood', 'chemistry', 'genetics')

qualifiers_regex = paste(qualifiers, collapse='|')

for(article in names(mesh_terms_by_article_list)){
 
  mesh_terms = mesh_terms_by_article_list[[article]]
  new_mesh_terms = c()
  
  if(!is.na(mesh_terms[1])){
    
    for(mesh_term in mesh_terms){

      # Look for simple qualifiers and delete them
      match_general = grepl(qualifiers_regex, mesh_term)
      match_agonists = grepl('agonists', mesh_term) & !grepl('ntagonists', mesh_term)
      match_blood = grepl('blood', mesh_term) & !grepl('blood supply', mesh_term)
      match_chemistry = grepl('chemistry', mesh_term) & !grepl('immunohistochemistry', mesh_term)
      match_genetics = grepl('genetics', mesh_term) & !grepl('Xgenetics', mesh_term)
      
      if(match_agonists) mesh_term = gsub('agonists', '', mesh_term)
      if(match_blood) mesh_term = gsub('blood', '', mesh_term)
      if(match_chemistry) mesh_term = gsub('chemistry', '', mesh_term)
      if(match_genetics) mesh_term = gsub('genetics', '', mesh_term)
      if(match_general) mesh_term = gsub(qualifiers_regex, '', mesh_term)
      
      # Separate MeSH terms by commas
      mesh_term = mesh_term %>% strsplit(', ') %>% unlist
      
      # Update
      new_mesh_terms = c(new_mesh_terms, mesh_term)
      
    }
  
    # Update article entries
    mesh_terms_by_article_list[[article]] = unique(new_mesh_terms)
  
  }
  
}

rm(qualifiers, qualifiers_all, qualifiers_regex, article, mesh_term, mesh_terms, new_mesh_terms,
   match_general, match_agonists, match_blood, match_genetics)
# Get list of all mesh terms
all_mesh_terms = c()
n = 0

for(mesh_terms_in_article in mesh_terms_by_article_list){
  all_mesh_terms = unique(c(all_mesh_terms, mesh_terms_in_article))
  n = c(n, length(all_mesh_terms))
}

mesh_terms_counts = data.frame('articles' = seq(1,length(n)), 'mesh_terms'=n)

print(paste0('Articles: ', length(mesh_terms_by_article_list), '       Mesh terms: ', length(all_mesh_terms)))
## [1] "Articles: 3709       Mesh terms: 4271"
ggplotly(mesh_terms_counts %>% ggplot(aes(articles, mesh_terms, color='#434343')) + geom_line() +
         geom_smooth(method='lm', color='#666666', se=FALSE, size=0.5) + theme_minimal() + theme(legend.position='none') +
         ggtitle('Increase in number of MeSH terms by each new article'))
rm(mesh_terms_in_article, n, mesh_terms_counts)

Create article-MeSH term matrix

all_articles = names(mesh_terms_by_article_list)

article_mesh_term_mat = data.frame(matrix(0, nrow=length(all_articles), ncol=sum(!is.na(all_mesh_terms))))
rownames(article_mesh_term_mat) = all_articles
colnames(article_mesh_term_mat) = all_mesh_terms[!is.na(all_mesh_terms)]

for(article in all_articles){
  article_mesh_terms = mesh_terms_by_article_list[[article]]
  if(sum(is.na(article_mesh_terms))==0){
    article_mesh_term_mat[article, article_mesh_terms] = 1
  }
}

rm(mesh_terms_by_article_list, article, article_mesh_terms)

An important proportion of articles (almost 10%) don’t have MeSH terms

mesh_terms_by_article = data.frame('id' = rownames(article_mesh_term_mat), 'n_mesh_terms' = rowSums(article_mesh_term_mat))

bins = max(mesh_terms_by_article$n_mesh_terms)-min(mesh_terms_by_article$n_mesh_terms)+1
ggplotly(mesh_terms_by_article %>% ggplot(aes(n_mesh_terms)) + geom_histogram(bins=bins, alpha=0.6) + 
              ggtitle('Distribution of number of MeSH terms by article') + theme_minimal())
rm(bins)

Most MeSH terms are present in only a few articles

articles_by_mesh_term = data.frame('id' = as.character(colnames(article_mesh_term_mat)),
                                   'n_articles' = colSums(article_mesh_term_mat), stringsAsFactors = FALSE)

bins = max(articles_by_mesh_term$n_articles)-min(articles_by_mesh_term$n_articles)+1
ggplotly(articles_by_mesh_term %>% ggplot(aes(n_articles)) + geom_histogram(bins=bins, alpha=0.6) + 
         scale_x_sqrt() + scale_y_sqrt() + theme_minimal() +
         ggtitle('Distribution of number of articles that contain each MeSH term'))
rm(bins)

Many of the most popular MeSH terms aren’t useful

top_n_counts = 50
top_mesh_terms = articles_by_mesh_term %>% top_n(top_n_counts, wt=n_articles)

ggplotly(top_mesh_terms %>% ggplot(aes(reorder(id, n_articles), n_articles, fill=n_articles)) + 
         geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() + 
         ggtitle(paste0('Top ', top_n_counts, ' MeSH Terms')) + 
         xlab('') + ylab('No. Articles') + theme_minimal() + theme(legend.position='none'))
rm(mesh_terms_by_article)

Filters:

  • Articles that have no MeSH terms
keep_articles = rowSums(article_mesh_term_mat)>0
print(paste0(sum(!keep_articles), '/', nrow(article_mesh_term_mat), ' articles have no MeSH terms'))
## [1] "363/3709 articles have no MeSH terms"


  • MeSH terms only present in a single article
keep_mesh_terms = colSums(article_mesh_term_mat)>1
print(paste0(sum(!keep_mesh_terms), '/', ncol(article_mesh_term_mat), ' MeSH terms are mentioned in a single article'))
## [1] "1714/4270 MeSH terms are mentioned in a single article"
article_mesh_term_mat = article_mesh_term_mat[keep_articles, keep_mesh_terms]
articles_by_mesh_term = articles_by_mesh_term %>% filter(id %in% colnames(article_mesh_term_mat))

print(paste0('Remaining Articles: ', nrow(article_mesh_term_mat), '       Mesh terms: ', ncol(article_mesh_term_mat)))
## [1] "Remaining Articles: 3346       Mesh terms: 2556"
rm(keep_articles, keep_mesh_terms)


  • Terms that are not related to the type of experiment that was conducted
keep_terms = c('abnormalit', 'acid', 'aging', 'allele', 'amin', 'analgesic', 'analysis', 'ancestry', 'astrocyte',
               'axon', 'behaviour', 'binding', 'biomark', 'blot', 'brain', 'case-control', 'cell', 'cereb', 
               'channel', 'chemistry', 'chromatin', 'chromosome', 'circadian', 'clon', 'cluster', 'codon', 
               'cognit', 'cohort', 'consanguinity', 'cortex', 'cpg', 'culture', 'dendrit', 'develop',
               'disease', 'disorder', 'dna', 'dopamin', 'drug', 'electro', 'enzym', 'epilepsy', 'etiology',
               'exome', 'exon', 'fibroblast', 'fluorescence', 'gabapentin', 'gen', 'haplo', 'heart', 'hippocamp', 
               'histon', 'hypothalamus', 'imaging', 'immun', 'in situ', 'in vitro', 'intron', 'kinase', 'knockout', 
               'link', 'megalencephaly', 'membran', 'mental', 'messenger', 'metabo', 'methyl', 'microcephaly',
               'microscop', 'microtubules', 'missense', 'model', 'molecul', 'muta', 'neuro', 'nonsense', 'nucleotide',
               'oxytocin', 'pair', 'pathway', 'peptid', 'pervasive', 'phenotype', 'phospor', 'polymerase', 
               'polymorphism', 'promoter', 'protein', 'psychiatric', 'receptor', 'recessive', 'region', 'regulat',
               'risk', 'rna', 'seizure', 'sequence', 'serotonin', 'sibling', 'signal', 'spectrometry',
               'splicing', 'statistics', 'striatum', 'synap', 'technique', 'trait loci', 'transcript',
               'transfection', 'transloc', 'tumor', 'vasopressin', 'zygote')

articles_by_mesh_term = articles_by_mesh_term %>% 
                        mutate('keep'=ifelse(grepl(paste(keep_terms,collapse='|'), tolower(id)), TRUE, FALSE)) %>%
                        filter(keep)

article_mesh_term_mat = article_mesh_term_mat %>% dplyr::select(articles_by_mesh_term$id)
article_mesh_term_mat = article_mesh_term_mat[rowSums(article_mesh_term_mat)>0, ]

print(paste0('Articles: ', nrow(article_mesh_term_mat), '       Mesh terms: ', ncol(article_mesh_term_mat)))
## [1] "Articles: 3345       Mesh terms: 1212"
rm(keep_terms)
top_n_counts = 50
top_mesh_terms = articles_by_mesh_term %>% top_n(top_n_counts, wt=n_articles)
ggplotly(top_mesh_terms %>% ggplot(aes(reorder(id, n_articles), n_articles, fill=n_articles)) + 
         geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() + 
         ggtitle(paste0('New top ', top_n_counts, ' MeSH Terms')) + 
         xlab('') + ylab('No. Articles') + theme_minimal() + theme(legend.position='none'))
rm(top_n_counts, top_mesh_terms)

Network:

MeSH Terms Network

sparse_article_mesh_term_mat = Matrix(as.matrix(article_mesh_term_mat), sparse=TRUE)

# EDGES
mesh_mesh_mat = t(sparse_article_mesh_term_mat) %*% sparse_article_mesh_term_mat

edges = summary(mesh_mesh_mat) %>% rename('from'=i, 'to'=j, 'count'=x) %>%
        mutate(from = colnames(article_mesh_term_mat)[from],
               to = colnames(article_mesh_term_mat)[to]) %>%
        filter(from!=to)

# Convert count to fraction
article_mesh_term_colsums = colSums(article_mesh_term_mat)
names(article_mesh_term_colsums) = colnames(article_mesh_term_mat)
edges = edges %>% mutate('weight'=count/(article_mesh_term_colsums[from]+article_mesh_term_colsums[to]))

# NODES
nodes = articles_by_mesh_term %>% mutate('id'=as.character(id), 'title'=as.character(id), 'value'=n_articles)

print(paste0('Nodes: ', nrow(nodes), '        Edges: ', nrow(edges)/2))
## [1] "Nodes: 1212        Edges: 63574"
edges %>% ggplot(aes(count, weight)) + geom_point(alpha=0.1, color='#006666') + ylab('proportion') +
          ggtitle('Effects of the change from Count to Proportion on edge weights') + theme_minimal()

graph = graph_from_data_frame(edges, vertices=nodes, directed=FALSE)
graph = graph %>% simplify(edge.attr.comb='first')

eigen_centrality_output = eigen_centrality(graph)
eigencentr = as.numeric(eigen_centrality_output$vector)

plot_data = data.frame('id'=graph %>% get.vertex.attribute('name'), 'eigencentrality'=eigencentr) %>%
            top_n(50, wt=eigencentrality)
ggplotly(plot_data %>% ggplot(aes(reorder(id, eigencentrality), eigencentrality, fill=eigencentrality)) + 
         geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() + 
         ggtitle(paste0('MeSH terms with the highest Network centrality')) + 
         xlab('') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
rm(plot_data)

Clustering

comms_louvain = cluster_louvain(graph)
modules = comms_louvain %>% membership

color_pal = viridis_pal()(max(modules)) %>% substr(1,7)
graph = graph %>% set_vertex_attr('eigencentrality', value=eigencentr) %>%
                  set_vertex_attr('module', value=as.numeric(modules)) %>% 
                  set_vertex_attr('color_modules', value=color_pal[as.numeric(modules)]) %>%
                  set_vertex_attr('color', value=color_pal[as.numeric(modules)])

comm_sizes = sizes(comms_louvain) %>% data.frame %>% rename(Module=Community.sizes, size=Freq)
ggplotly(comm_sizes %>% ggplot(aes(Module, size, fill=size)) + geom_bar(stat='identity') + 
         ggtitle('Number of MeSH Terms in each module') + theme_minimal() + theme(legend.position = 'none'))
visIgraph(graph) %>% visOptions(selectedBy='module', highlightNearest=TRUE) %>% 
                     visIgraphLayout(randomSeed=123)
rm(color_pal, eigen_centrality_output, eigencentr, comm_sizes, edges, nodes)

Most important MeSH terms for each module (chosen by eigencentrality):

  • Module 1: Chemistry terms?

  • Module 2: MAP Kinase / Proteins

  • Module 3: Proteins

  • Module 4: 2 nodes: Genomic library + Early growth response protein 2

  • Module 5: AA Transport and Faty acid-binding proteins

  • Module 6: TATA Box (TFIID binds to the TATA box)

  • Module 7: DNA Sequencing, Mutations, Autism

  • Module 8: Proteins, heart

  • Module 9: Medicine / Clinical?

  • Module 10: 2 nodes: TH1 and TH2 Cells

  • Module 11: Immune system

  • Module 12: Neuroscience

  • Module 13: Human leukocyte antigens < Immune system

  • Module 14: Dopamine, serotonine, methamphetamine

  • Module 15: Trancription Factors, chromosome pairs

module_info = tibble('module'=character(), 'top_term'=character(), size=integer())

for(i in names(sizes(comms_louvain))){
  
  module_vertices = names(modules)[modules==i]
  
  # Creat subgraph
  module_graph = induced_subgraph(graph, module_vertices)
  
  # Calculate eigencentrality
  eigen_centrality_output = eigen_centrality(module_graph)
  eigencentr = as.numeric(eigen_centrality_output$vector)
  names(eigencentr) = module_vertices
  
  module_info = module_info %>% add_row('module'=i, 'top_term'=names(eigencentr)[eigencentr==max(eigencentr)][1], 
                                        'size'=length(module_vertices))
  
  # Plot eigencentrality of top 10 MeSH terms
  plot_data = data.frame('mesh_term'=names(eigencentr), 'eigencentrality'=eigencentr) %>%
              top_n(10, wt=eigencentrality)
  print(plot_data %>% ggplot(aes(reorder(mesh_term, eigencentrality), eigencentrality, fill=eigencentrality)) + 
       geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() + 
       ggtitle(paste0('Top terms for Module ', i)) + 
       xlab('MeSH Terms') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))

}

rm(i, module_vertices, module_graph, eigen_centrality_output, eigencentr, plot_data, comms_louvain)

Module Network

  • Collapsing edges: sum all original weights and divide by the product of the number of elements in both (the number of possible interactions they could have)

  • The Network plot is not that informative

module_graph = contract(graph, graph %>% get.vertex.attribute('module'), vertex.attr.comb='first') %>%
               simplify(edge.attr.comb='sum') %>% set_vertex_attr('name', value=module_info$module) %>%
                                                  set_vertex_attr('title', value=module_info$top_term) %>%
                                                  set_vertex_attr('module_size', value=module_info$size) %>%
                                                  set_vertex_attr('value', value=module_info$size)

nodes_module_graph  = module_graph %>% as_data_frame(what='vertices') %>% mutate('id'=name)
edges_module_graph = module_graph %>% as_data_frame() %>% 
                     mutate('corrected_weight' = weight/(nodes_module_graph$module_size[as.numeric(from)]*
                                                 nodes_module_graph$module_size[as.numeric(to)])*100)

module_graph = module_graph %>% set_edge_attr('weight', value=edges_module_graph$corrected_weight)

visIgraph(module_graph) %>% visIgraphLayout(randomSeed=123) %>% visOptions(highlightNearest=TRUE)
rm(module_info, nodes_module_graph, edges_module_graph)
adj_mat = module_graph %>% as_adjacency_matrix(attr='weight') %>% as.matrix
adj_mat = adj_mat

heatmap.2(adj_mat, cellnote=round(adj_mat,2), dendrogram='none', notecol='gray', scale='none', trace='none',
          key=FALSE, xlab='Modules', ylab='Modules', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))

rm(adj_mat)

Save file with the modules each paper belongs to through its mesh terms

mesh_term_module = graph %>% as_data_frame(what='vertices') %>% dplyr::select(name, module)

mesh_term_module_mat = matrix(0, nrow(mesh_term_module), max(mesh_term_module$module))
rownames(mesh_term_module_mat) = mesh_term_module$name
colnames(mesh_term_module_mat) = sort(unique(mesh_term_module$module))

for(i in seq(1, nrow(mesh_term_module))){
  mesh_term_module_mat[i,mesh_term_module[i,2]] = 1
}

if(!all(colnames(article_mesh_term_mat) == rownames(mesh_term_module_mat))){
  print('Row and column order dont match!')
}

article_module_mat = as.matrix(article_mesh_term_mat) %*% mesh_term_module_mat
rownames(article_module_mat) = rownames(article_mesh_term_mat)

write.csv(article_module_mat, file='./../Data/SFARI/article_module_matrix_wo_qualifiers.csv')

print('Distribution of number of modules per article')
## [1] "Distribution of number of modules per article"
table(apply(article_module_mat, 1, function(x) sum(x>0)))
## 
##    1    2    3    4    5    6    7    8 
##  620 1195  938  417  150   21    3    1
melt_article_module = melt(article_module_mat)
colnames(melt_article_module) = c('articleID', 'module', 'value')
melt_article_module$module = as.factor(melt_article_module$module)

ggplotly(melt_article_module %>% ggplot(aes(value, fill=module, color=module)) +
         geom_histogram(position='identity', bins=max(melt_article_module$value), alpha=0.5) + 
         facet_grid(module~., scales='free_y') + ggtitle('Module content distribution per article') + 
         xlab('n_articles') + ylab('Module content') + theme_minimal() + theme(legend.position='none'))
rm(mesh_term_module, mesh_term_module_mat, i)